﻿using System;
using System.Collections.Generic;
using Microsoft.ML;
using Microsoft.ML.Data;

namespace Samples.Dynamic
{
    public static partial class TransformSamples
    {
        /// <summary>
        /// Runs the sample: splits three short sentences into character tokens, builds
        /// character n-gram count vectors with two separate pipelines, and writes the
        /// contents of both resulting columns to the console.
        /// </summary>
        public static void Example()
        {
            // Entry point for ML.NET operations. It can be used for exception tracking and logging,
            // as well as the source of randomness.
            var mlContext = new MLContext();

            // A small in-memory dataset.
            //
            // Sentiment    SentimentText
            // true         Best game I've ever played.
            // false        ==RUDE== Dude, 2
            // true         Until the next game, this is the best Xbox game!
            var samples = new List<SampleSentimentData>
            {
                new SampleSentimentData
                {
                    Sentiment = true,
                    SentimentText = "Best game I've ever played."
                },
                new SampleSentimentData
                {
                    Sentiment = false,
                    SentimentText = "==RUDE== Dude, 2"
                },
                new SampleSentimentData
                {
                    Sentiment = true,
                    SentimentText = "Until the next game, this is the best Xbox game!"
                }
            };

            // Expose the in-memory list as an IDataView.
            var inputView = mlContext.Data.LoadFromEnumerable(samples);

            var textCatalog = mlContext.Transforms.Text;

            // Shared first stage: tokenize SentimentText into characters (as keys), with
            // marker characters switched off.
            var tokenizer = textCatalog.TokenizeIntoCharactersAsKeys(
                "Chars", "SentimentText", useMarkerCharacters: false);

            // Two pipelines are built on top of the tokenizer. The first explicitly requests
            // n-grams of length 1; the second leaves every n-gram setting at its default.
            var unigramPipeline = tokenizer.Append(
                textCatalog.ProduceNgrams("CharsUnigrams", "Chars", ngramLength: 1));
            var twogramPipeline = tokenizer.Append(
                textCatalog.ProduceNgrams("CharsTwograms", "Chars"));

            // Fit each pipeline on the data, then apply it to the same data.
            var unigramModel = unigramPipeline.Fit(inputView);
            var unigramView = unigramModel.Transform(inputView);
            var twogramModel = twogramPipeline.Fit(inputView);
            var twogramView = twogramModel.Transform(inputView);

            // Prints a header line, then one console line per row listing each item of the
            // row's feature vector as 'slot name' - value, and finally a separator line.
            void PrintColumn(string columnName, IEnumerable<VBuffer<float>> rows, VBuffer<ReadOnlyMemory<char>> names)
            {
                Console.WriteLine($"{columnName} column obtained post-transformation.");

                // Slot names are looked up by the index (Key) of each feature item.
                var labels = names.GetValues();
                foreach (var row in rows)
                {
                    foreach (var entry in row.Items())
                    {
                        Console.Write($"'{labels[entry.Key]}' - {entry.Value} ");
                    }

                    Console.WriteLine();
                }

                Console.WriteLine("===================================================");
            }

            // Preview of the CharsUnigrams column obtained after processing the input.
            var unigramColumn = unigramView.Schema["CharsUnigrams"];
            VBuffer<ReadOnlyMemory<char>> unigramNames = default;
            unigramColumn.GetSlotNames(ref unigramNames);
            PrintColumn("CharsUnigrams", unigramView.GetColumn<VBuffer<float>>(unigramColumn), unigramNames);

            // CharsUnigrams column obtained post-transformation.
            // 'B' - 1 'e' - 6 's' - 1 't' - 1 '<?>' - 4 'g' - 1 'a' - 2 'm' - 1 'I' - 1 ''' - 1 'v' - 2 ...
            // 'e' - 1 '<?>' - 2 'd' - 1 '=' - 4 'R' - 1 'U' - 1 'D' - 2 'E' - 1 'u' - 1 ',' - 1 '2' - 1
            // 'B' - 0 'e' - 6 's' - 3 't' - 6 '<?>' - 9 'g' - 2 'a' - 2 'm' - 2 'I' - 0 ''' - 0 'v' - 0 ...

            // Preview of the CharsTwograms column obtained after processing the input.
            var twogramColumn = twogramView.Schema["CharsTwograms"];
            VBuffer<ReadOnlyMemory<char>> twogramNames = default;
            twogramColumn.GetSlotNames(ref twogramNames);
            PrintColumn("CharsTwograms", twogramView.GetColumn<VBuffer<float>>(twogramColumn), twogramNames);

            // CharsTwograms column obtained post-transformation.
            // 'B' - 1 'B|e' - 1 'e' - 6 'e|s' - 1 's' - 1 's|t' - 1 't' - 1 't|<?>' - 1 '<?>' - 4 '<?>|g' - 1 ...
            // 'e' - 1 '<?>' - 2 'd' - 1 '=' - 4 '=|=' - 2 '=|R' - 1 'R' - 1 'R|U' - 1 'U' - 1 'U|D' - 1 'D' - 2 ...
            // 'B' - 0 'B|e' - 0 'e' - 6 'e|s' - 1 's' - 3 's|t' - 1 't' - 6 't|<?>' - 2 '<?>' - 9 '<?>|g' - 2 ...
        }

        /// <summary>
        /// One labeled example: a piece of text together with the sentiment assigned to it.
        /// The sentiment is stored as a <see cref="bool"/>; in the sample data built by
        /// <c>Example</c>, <c>true</c> is used for positive text and <c>false</c> for negative text.
        /// </summary>
        public class SampleSentimentData
        {
            /// <summary>The sentiment label assigned to <see cref="SentimentText"/>.</summary>
            public bool Sentiment { get; set; }

            /// <summary>The raw text of the example.</summary>
            public string SentimentText { get; set; }
        }
    }
}
